## DDSAnalytics is an analytics company specializing in talent management solutions for Fortune 100 companies. The goal is to analyze existing employee data to highlight turnover.
##Load Case Study 2 data set
# Load the case-study data set and the "No Attrition" comparison set.
# header = TRUE is already the read.csv() default; it is kept explicit.
Attrition <- read.csv(
  "/Users/Kevin/Desktop/School/Doing Data Science/Project 2/CaseStudy2_data.csv",
  header = TRUE
)
NOAttrition <- read.csv(
  "/Users/Kevin/Desktop/School/Doing Data Science/Project 2/CaseStudy2CompSet No Attrition.csv",
  header = TRUE
)
# Split the employees into two data frames by the value of the Attrition
# column ("No" / "Yes").
#
# SQL string literals are single-quoted. The previous query wrote
# "Attrition" = "No": SQLite reads a double-quoted token as an identifier
# first and only falls back to a string literal when no such column exists,
# so the comparison worked by accident of that legacy behaviour.
AttritionNo <- sqldf("
  select *
  from Attrition
  where Attrition = 'No'
")
AttritionYes <- sqldf("
  select *
  from Attrition
  where Attrition = 'Yes'
")
## Parse and Summarize data
# Per-attrition-group means of the main numeric attributes plus the group
# size, printed as a tibble.
Attrition %>%
  group_by(Attrition) %>%
  summarize(
    MeanCompanysWorked = mean(NumCompaniesWorked),
    MeanMonthlyIncome = mean(MonthlyIncome),
    MeanYearsAtCompany = mean(YearsAtCompany),
    MeanWorkAge = mean(Age),
    MeanTotalWorkingYears = mean(TotalWorkingYears),
    MeanJobLevel = mean(JobLevel),
    MeanPerformance = mean(PerformanceRating),
    # Fixed: this column was previously computed from YearsInCurrentRole,
    # which does not match its name.
    MeanYearsSincePromotion = mean(YearsSinceLastPromotion),
    MeanSalaryHike = mean(PercentSalaryHike),
    Total = n()
  ) %>%
  arrange(Attrition, Total)
## # A tibble: 2 x 11
## Attrition MeanCompanysWorked MeanMonthlyIncome MeanYearsAtCompany MeanWorkAge
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 No 2.66 6702 7.30 37.4
## 2 Yes 3.08 4765. 5.19 33.8
## # … with 6 more variables: MeanTotalWorkingYears <dbl>, MeanJobLevel <dbl>,
## # MeanPerformance <dbl>, MeanYearsSincePromotion <dbl>, MeanSalaryHike <dbl>,
## # Total <int>
## Plot relationships
# Group-level means by attrition status, kept as a data frame and rendered
# with kable().
AttritionStats <- Attrition %>%
  group_by(Attrition) %>%
  summarize(
    MeanAge = mean(Age),
    MeanMonthlyIncome = mean(MonthlyIncome),
    MeanWorkingYears = mean(TotalWorkingYears),
    MeanJobLevel = mean(JobLevel),
    MeanPerformance = mean(PerformanceRating),
    # Fixed: this column was previously computed from YearsInCurrentRole,
    # which does not match its name.
    MeanYearsSincePromotion = mean(YearsSinceLastPromotion),
    MeanSalaryHike = mean(PercentSalaryHike),
    Total = n()
  ) %>%
  arrange(Attrition, Total)
kable(AttritionStats, position = "left")
## NOTE: the table below was rendered before the MeanYearsSincePromotion fix;
## that column holds mean(YearsInCurrentRole) and will change on re-knit.
## | Attrition | MeanAge | MeanMonthlyIncome | MeanWorkingYears | MeanJobLevel | MeanPerformance | MeanYearsSincePromotion | MeanSalaryHike | Total |
## |---|---|---|---|---|---|---|---|---|
## | No | 37.41233 | 6702.000 | 11.602740 | 2.116438 | 3.149315 | 4.453425 | 15.17534 | 730 |
## | Yes | 33.78571 | 4764.786 | 8.185714 | 1.635714 | 3.164286 | 2.907143 | 15.32857 | 140 |
# Group-level means for every Attrition x BusinessTravel combination.
# TotalPop and Total both hold the group size; TotalPop is the one the plots
# further down read.
TravelStats <- Attrition %>%
  group_by(Attrition, BusinessTravel) %>%
  summarize(
    TotalPop = n(),
    MeanWorkAge = mean(Age),
    MeanWorkLifeBalance = mean(WorkLifeBalance),
    MeanJobSatisfaction = mean(JobSatisfaction),
    # Previously listed twice with the same expression; one copy removed.
    MeanTotalWorkingYears = mean(TotalWorkingYears),
    MeanMonthlyIncome = mean(MonthlyIncome),
    MeanYearsAtCompany = mean(YearsAtCompany),
    MeanJobLevel = mean(JobLevel),
    MeanPerformance = mean(PerformanceRating),
    # Fixed: this column was previously computed from YearsInCurrentRole,
    # which does not match its name.
    MeanYearsSincePromotion = mean(YearsSinceLastPromotion),
    MeanSalaryHike = mean(PercentSalaryHike),
    Total = n()
  ) %>%
  arrange(Attrition, Total)
kable(TravelStats, position = "left")
## NOTE: the table below was rendered before the MeanYearsSincePromotion fix;
## that column holds mean(YearsInCurrentRole) and will change on re-knit.
## | Attrition | BusinessTravel | TotalPop | MeanWorkAge | MeanWorkLifeBalance | MeanJobSatisfaction | MeanTotalWorkingYears | MeanMonthlyIncome | MeanYearsAtCompany | MeanJobLevel | MeanPerformance | MeanYearsSincePromotion | MeanSalaryHike | Total |
## |---|---|---|---|---|---|---|---|---|---|---|---|---|---|
## | No | Non-Travel | 83 | 37.54217 | 2.843374 | 2.795181 | 10.530121 | 5820.578 | 7.024096 | 1.963855 | 3.132530 | 4.156626 | 15.28916 | 83 |
## | No | Travel_Frequently | 123 | 37.92683 | 2.804878 | 2.959350 | 12.406504 | 6750.512 | 8.113821 | 2.186992 | 3.162602 | 4.829268 | 15.04878 | 123 |
## | No | Travel_Rarely | 524 | 37.27099 | 2.805344 | 2.709924 | 11.583970 | 6830.227 | 7.154580 | 2.124046 | 3.148855 | 4.412214 | 15.18702 | 524 |
## | Yes | Non-Travel | 11 | 31.81818 | 2.545454 | 1.909091 | 7.181818 | 5385.727 | 3.818182 | 1.909091 | 3.272727 | 2.272727 | 16.54545 | 11 |
## | Yes | Travel_Frequently | 35 | 32.48571 | 2.857143 | 2.714286 | 6.942857 | 3623.000 | 4.800000 | 1.400000 | 3.200000 | 2.514286 | 15.28571 | 35 |
## | Yes | Travel_Rarely | 94 | 34.50000 | 2.563830 | 2.393617 | 8.765957 | 5117.255 | 5.500000 | 1.691489 | 3.138298 | 3.127660 | 15.20213 | 94 |
# Group-level means for every Attrition x JobRole combination.
# TotalPop and Total both hold the group size; TotalPop is the one the plots
# further down read.
JobRoleStats <- Attrition %>%
  group_by(Attrition, JobRole) %>%
  summarize(
    TotalPop = n(),
    MeanWorkAge = mean(Age),
    MeanWorkLifeBalance = mean(WorkLifeBalance),
    MeanJobSatisfaction = mean(JobSatisfaction),
    # Previously listed twice with the same expression; one copy removed.
    MeanTotalWorkingYears = mean(TotalWorkingYears),
    MeanMonthlyIncome = mean(MonthlyIncome),
    MeanYearsAtCompany = mean(YearsAtCompany),
    MeanJobLevel = mean(JobLevel),
    MeanPerformance = mean(PerformanceRating),
    # Fixed: this column was previously computed from YearsInCurrentRole,
    # which does not match its name.
    MeanYearsSincePromotion = mean(YearsSinceLastPromotion),
    MeanSalaryHike = mean(PercentSalaryHike),
    Total = n()
  ) %>%
  arrange(Attrition, Total)
kable(JobRoleStats, position = "left")
## NOTE: the table below was rendered before the MeanYearsSincePromotion fix;
## that column holds mean(YearsInCurrentRole) and will change on re-knit.
## | Attrition | JobRole | TotalPop | MeanWorkAge | MeanWorkLifeBalance | MeanJobSatisfaction | MeanTotalWorkingYears | MeanMonthlyIncome | MeanYearsAtCompany | MeanJobLevel | MeanPerformance | MeanYearsSincePromotion | MeanSalaryHike | Total |
## |---|---|---|---|---|---|---|---|---|---|---|---|---|---|
## | No | Human Resources | 21 | 35.71429 | 2.952381 | 2.714286 | 7.095238 | 3527.905 | 5.333333 | 1.285714 | 3.190476 | 3.2380952 | 15.57143 | 21 |
## | No | Sales Representative | 29 | 32.24138 | 2.758621 | 2.827586 | 5.275862 | 2849.207 | 3.379310 | 1.103448 | 3.103448 | 2.5862069 | 15.51724 | 29 |
## | No | Manager | 47 | 47.36170 | 2.765957 | 2.531915 | 24.808511 | 17163.362 | 13.446808 | 4.319149 | 3.212766 | 6.2765957 | 15.06383 | 47 |
## | No | Research Director | 50 | 43.74000 | 2.860000 | 2.480000 | 20.940000 | 15674.000 | 10.000000 | 3.860000 | 3.080000 | 5.6200000 | 14.96000 | 50 |
## | No | Healthcare Representative | 68 | 39.17647 | 2.676471 | 2.838235 | 13.397059 | 7323.176 | 8.338235 | 2.426471 | 3.147059 | 4.7058824 | 15.42647 | 68 |
## | No | Manufacturing Director | 85 | 38.05882 | 2.858824 | 2.741176 | 12.235294 | 7494.471 | 7.952941 | 2.458823 | 3.211765 | 5.4941176 | 15.75294 | 85 |
## | No | Laboratory Technician | 123 | 34.82114 | 2.861789 | 2.739837 | 8.268293 | 3310.496 | 5.821138 | 1.292683 | 3.154472 | 3.5934959 | 14.87805 | 123 |
## | No | Research Scientist | 140 | 34.97143 | 2.700000 | 2.900000 | 8.035714 | 3337.043 | 5.300000 | 1.214286 | 3.114286 | 3.4642857 | 15.25000 | 140 |
## | No | Sales Executive | 167 | 36.73653 | 2.880240 | 2.784431 | 11.149701 | 6802.311 | 7.706587 | 2.317365 | 3.149701 | 4.8982036 | 14.92216 | 167 |
## | Yes | Research Director | 1 | 41.00000 | 3.000000 | 3.000000 | 23.000000 | 19545.000 | 22.000000 | 5.000000 | 3.000000 | 15.0000000 | 12.00000 | 1 |
## | Yes | Manufacturing Director | 2 | 46.50000 | 2.500000 | 2.000000 | 19.500000 | 7962.000 | 5.500000 | 2.500000 | 3.000000 | 4.5000000 | 13.50000 | 2 |
## | Yes | Manager | 4 | 49.75000 | 2.750000 | 2.250000 | 23.250000 | 17594.250 | 17.750000 | 4.500000 | 3.000000 | 8.2500000 | 14.00000 | 4 |
## | Yes | Human Resources | 6 | 28.50000 | 3.000000 | 2.000000 | 2.666667 | 2433.167 | 2.000000 | 1.000000 | 3.000000 | 0.8333333 | 13.50000 | 6 |
## | Yes | Healthcare Representative | 8 | 39.87500 | 2.625000 | 2.750000 | 16.875000 | 8388.750 | 12.125000 | 2.750000 | 3.125000 | 5.6250000 | 14.25000 | 8 |
## | Yes | Sales Representative | 24 | 28.37500 | 3.000000 | 2.541667 | 3.416667 | 2415.542 | 2.375000 | 1.000000 | 3.083333 | 1.5416667 | 14.54167 | 24 |
## | Yes | Laboratory Technician | 30 | 31.30000 | 2.366667 | 2.466667 | 6.266667 | 2858.433 | 3.300000 | 1.166667 | 3.200000 | 2.1666667 | 15.63333 | 30 |
## | Yes | Research Scientist | 32 | 33.84375 | 2.656250 | 2.375000 | 6.468750 | 2919.344 | 4.281250 | 1.062500 | 3.312500 | 2.0312500 | 17.06250 | 32 |
## | Yes | Sales Executive | 33 | 36.48485 | 2.515151 | 2.424242 | 11.000000 | 7344.545 | 6.696970 | 2.424242 | 3.121212 | 4.0303030 | 14.90909 | 33 |
# Group-level means for every Attrition x OverTime combination.
# TotalPop and Total both hold the group size; TotalPop is the one the plots
# further down read.
OvertimeStats <- Attrition %>%
  group_by(Attrition, OverTime) %>%
  summarize(
    TotalPop = n(),
    MeanWorkAge = mean(Age),
    # Previously listed twice with the same expression; one copy removed.
    MeanTotalWorkingYears = mean(TotalWorkingYears),
    MeanMonthlyIncome = mean(MonthlyIncome),
    MeanYearsAtCompany = mean(YearsAtCompany),
    MeanJobLevel = mean(JobLevel),
    MeanPerformance = mean(PerformanceRating),
    # Fixed: this column was previously computed from YearsInCurrentRole,
    # which does not match its name.
    MeanYearsSincePromotion = mean(YearsSinceLastPromotion),
    MeanSalaryHike = mean(PercentSalaryHike),
    Total = n()
  ) %>%
  arrange(Attrition, Total)
kable(OvertimeStats, position = "left")
## NOTE: the table below was rendered before the MeanYearsSincePromotion fix;
## that column holds mean(YearsInCurrentRole) and will change on re-knit.
## | Attrition | OverTime | TotalPop | MeanWorkAge | MeanTotalWorkingYears | MeanMonthlyIncome | MeanYearsAtCompany | MeanJobLevel | MeanPerformance | MeanYearsSincePromotion | MeanSalaryHike | Total |
## |---|---|---|---|---|---|---|---|---|---|---|---|
## | No | Yes | 172 | 38.27907 | 12.087209 | 7000.343 | 7.860465 | 2.203488 | 3.116279 | 4.697674 | 14.94767 | 172 |
## | No | No | 558 | 37.14516 | 11.453405 | 6610.038 | 7.129032 | 2.089606 | 3.159498 | 4.378136 | 15.24552 | 558 |
## | Yes | No | 60 | 33.78333 | 8.866667 | 5110.067 | 6.300000 | 1.750000 | 3.133333 | 3.350000 | 14.81667 | 60 |
## | Yes | Yes | 80 | 33.78750 | 7.675000 | 4505.825 | 4.362500 | 1.550000 | 3.187500 | 2.575000 | 15.71250 | 80 |
# Exploratory plots. Each plot is its own statement; previously consecutive
# plots were fused onto one line, which R cannot parse.

# Box plots of three numeric attributes split by attrition status.
Attrition %>%
  ggplot(aes(x = Attrition, y = TotalWorkingYears)) +
  geom_boxplot(color = "blue", fill = "black") +
  ggtitle("Attrition based on Years in Workforce")

Attrition %>%
  ggplot(aes(x = Attrition, y = MonthlyRate)) +
  geom_boxplot(color = "blue", fill = "black") +
  ggtitle("Attrition based on Monthly Rate")

Attrition %>%
  ggplot(aes(x = Attrition, y = YearsInCurrentRole)) +
  geom_boxplot(color = "blue", fill = "black") +
  ggtitle("Attrition based on Years in Current Role")

# Head counts by business travel, stacked two different ways.
TravelStats %>%
  ggplot(aes(x = Attrition, y = TotalPop, fill = BusinessTravel)) +
  geom_bar(stat = "identity") +
  ggtitle("Attrition based on Travel")

TravelStats %>%
  ggplot(aes(x = reorder(BusinessTravel, TotalPop), y = TotalPop, fill = Attrition)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Business Travel Turnover") +
  xlab("Travel") +
  ylab("Total Population")

# Head counts by overtime status.
OvertimeStats %>%
  ggplot(aes(x = Attrition, y = TotalPop, fill = OverTime)) +
  geom_bar(stat = "identity") +
  ggtitle("Overtime vs No Overtime")

# Age against income, one facet per attrition status with free scales.
Attrition %>%
  ggplot(aes(x = Age, y = MonthlyIncome)) +
  geom_point(aes(x = Age, y = MonthlyIncome, color = Attrition)) +
  ggtitle("Age vs Monthly Income") +
  scale_color_discrete(name = "Attrition") +
  facet_wrap(~Attrition, scales = "free")

# Income distribution per job role.
Attrition %>%
  ggplot(aes(x = reorder(JobRole, MonthlyIncome), y = MonthlyIncome, fill = JobRole)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Job Role vs Monthly Income") +
  xlab("Job Role") +
  ylab("Monthly Income")

# Job-role summaries split by attrition status, largest group first.
JobRoleStatsYes <- JobRoleStats %>% filter(Attrition == "Yes") %>% arrange(-TotalPop)
JobRoleStatsNo <- JobRoleStats %>% filter(Attrition == "No") %>% arrange(-TotalPop)

# NOTE(review): JobRoleStats has one row per Attrition x JobRole group, so
# each box in the next two plots is drawn from group means only, not from
# individual employees. Confirm this is intended.
JobRoleStats %>%
  ggplot(aes(x = reorder(JobRole, MeanJobSatisfaction), y = MeanJobSatisfaction, fill = JobRole)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Job Role vs Satisfaction") +
  xlab("Job Role") +
  ylab("Job Satisfaction")

JobRoleStats %>%
  ggplot(aes(x = reorder(JobRole, MeanWorkLifeBalance), y = MeanWorkLifeBalance, fill = JobRole)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Job Role vs Work Life Balance") +
  xlab("Job Role") +
  ylab("Work Life Balance")

# Employees who left (Attrition == "Yes"), by job role.
JobRoleStatsYes %>%
  ggplot(aes(x = reorder(JobRole, TotalPop), y = TotalPop, fill = JobRole)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Attrition by Job Role") +
  xlab("Job Role") +
  ylab("Total Population")

# Fixed: the value axis shows MeanWorkLifeBalance but was labelled
# "Total Population".
JobRoleStatsYes %>%
  ggplot(aes(x = reorder(JobRole, -MeanWorkLifeBalance), y = MeanWorkLifeBalance, fill = JobRole)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Work Life Balance by Job Role") +
  xlab("Job Role") +
  ylab("Mean Work Life Balance")

# Employees who stayed (Attrition == "No"), by job role.
# Fixed: this plot was titled "Attrition by Job Role" although it only
# contains employees without attrition.
JobRoleStatsNo %>%
  ggplot(aes(x = reorder(JobRole, TotalPop), y = TotalPop, fill = JobRole)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Retained Employees by Job Role") +
  xlab("Job Role") +
  ylab("Total Population")

# Fixed: same wrong "Total Population" axis label as above.
JobRoleStatsNo %>%
  ggplot(aes(x = reorder(JobRole, -MeanWorkLifeBalance), y = MeanWorkLifeBalance, fill = JobRole)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Work Life Balance by Job Role") +
  xlab("Job Role") +
  ylab("Mean Work Life Balance")

# Interactive 3-D scatter: working years x age x income, coloured by role.
# Fixed: z-axis title typo 'MonthyIncome'.
p <- plot_ly(Attrition, x = ~TotalWorkingYears, y = ~Age, z = ~MonthlyIncome, color = ~JobRole) %>%
  add_markers() %>%
  layout(scene = list(
    xaxis = list(title = "WorkingYears"),
    yaxis = list(title = "Age"),
    zaxis = list(title = "MonthlyIncome")
  ))
p
#Perform Classification using KNN
#Model Data using KNN for Age and Monthly Income
##Plot Relationship Between Age and Monthly Income
# Scatter of income against age, coloured by attrition status, with one
# fitted linear trend per colour group.
# `labels` is spelled out in full; the previous `label =` only reached the
# right argument through partial argument matching.
Attrition %>%
  ggplot(aes(x = Age, y = MonthlyIncome, color = Attrition)) +
  geom_point() +
  xlab("Age") +
  geom_smooth(method = "lm") +
  ylab("Monthly Income") +
  ggtitle("Relationship Between Age and Monthly Income") +
  scale_y_continuous(labels = comma)
#Split out training/test data - 70/30
# 70/30 train/test split; seeded so the same rows are drawn on every run.
set.seed(100)
splitPerc <- .70
trainAttrition <- sample(1:dim(Attrition)[1], round(splitPerc * dim(Attrition)[1]))
train <- Attrition[trainAttrition, ]
test <- Attrition[-trainAttrition, ]
accs <- data.frame(accuracy = numeric(90), k = numeric(90))
#Formulate the optimal k value for KNN
# Predictors are selected by name; these are the columns previously picked
# by position with c(2, 20).
knn_features <- c("Age", "MonthlyIncome")
for (i in seq_len(nrow(accs))) {
  classifications <- knn(train[, knn_features], test[, knn_features],
                         train$Attrition, prob = TRUE, k = i)
  # caret expects predictions in rows and the reference in columns. Accuracy
  # is the same either way, but the table is now built in that orientation.
  # The bare table() call that used to sit here printed nothing inside the
  # loop and has been removed.
  CM <- confusionMatrix(table(classifications, test$Attrition))
  accs$accuracy[i] <- CM$overall[1]
  accs$k[i] <- i
}
# Accuracy against k, with the best-scoring k marked in red.
plot(accs$k, accs$accuracy, type = "l", xlab = "k")
abline(v = accs$k[which.max(accs$accuracy)], col = "red")
accs$k[which.max(accs$accuracy)]
## [1] 8
# Final KNN model: fit on the training rows and score on the held-out rows.
#
# Fixed:
#  * The model was previously trained and evaluated on the full data set
#    (the same rows on both sides), so the reported accuracy was not an
#    out-of-sample estimate.
#  * k was hard-coded to 5 although the search above selects the best k.
#  * An unseeded re-split produced train/test sets that were never used; the
#    seeded split from the tuning step is reused instead.
# NOTE(review): k was chosen on this same test set, so the score is still
# optimistic; a separate validation split or cross-validation would avoid it.
best_k <- accs$k[which.max(accs$accuracy)]
knn_features <- c("Age", "MonthlyIncome")
classification <- knn(train[, knn_features], test[, knn_features],
                      train$Attrition, prob = TRUE, k = best_k)
table(classification, test$Attrition)
## NOTE: the output below was rendered before this fix (full data, k = 5)
## and will change on re-knit.
##
## classification No Yes
## No 717 118
## Yes 13 22
confusionMatrix(table(classification, test$Attrition))
## NOTE: the statistics below were rendered before this fix (full data,
## k = 5) and will change on re-knit.
## Confusion Matrix and Statistics
##
##
## classification No Yes
## No 717 118
## Yes 13 22
##
## Accuracy : 0.8494
## 95% CI : (0.8239, 0.8725)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 0.2176
##
## Kappa : 0.1999
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9822
## Specificity : 0.1571
## Pos Pred Value : 0.8587
## Neg Pred Value : 0.6286
## Prevalence : 0.8391
## Detection Rate : 0.8241
## Detection Prevalence : 0.9598
## Balanced Accuracy : 0.5697
##
## 'Positive' Class : No
##
#Perform Classification using Random Forest
#Model Data with Random Forest
##Read in Data
Attrition <- read.csv(
  "/Users/Kevin/Desktop/School/Doing Data Science/Project 2/CaseStudy2_data.csv",
  header = TRUE
)
NOAttrition <- read.csv(
  "/Users/Kevin/Desktop/School/Doing Data Science/Project 2/CaseStudy2CompSet No Attrition.csv",
  header = TRUE
)
# Convert the response to a factor with "Yes" as the first level.
Attrition$Attrition <- factor(
  as.character(Attrition$Attrition),
  levels = c("Yes", "No")
)
# Variable-importance run on the full data: 1000 trees, MonthlyIncome left
# out of the formula, forest itself not kept.
# NOTE(review): `.` also pulls in ID and EmployeeNumber as predictors;
# confirm that is intended.
Attrition_Variables <- randomForest(
  Attrition ~ . - MonthlyIncome,
  data = Attrition,
  ntree = 1000,
  keep.forest = FALSE,
  importance = TRUE
)
varImpPlot(Attrition_Variables)
#Split test and train data - 70/30
set.seed(3033)
splitPerc <- .70
trainAttrition <- sample(
  seq_len(nrow(Attrition)),
  round(splitPerc * nrow(Attrition))
)
train <- Attrition[trainAttrition, ]
test <- Attrition[-trainAttrition, ]
# Fit on the training rows with every column except Age as a predictor.
# strata + sampsize request 60 rows from each Attrition level per tree.
EmpAtt <- randomForest(
  Attrition ~ . - Age,
  data = train,
  strata = train$Attrition,
  sampsize = c(60, 60)
)
# Score the held-out rows with the fitted forest.
AttPredict <- predict(EmpAtt, newdata = test)
# Compare predictions with the true labels.
confusionMatrix(AttPredict, test$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 25 30
## No 11 195
##
## Accuracy : 0.8429
## 95% CI : (0.793, 0.8849)
## No Information Rate : 0.8621
## P-Value [Acc > NIR] : 0.838542
##
## Kappa : 0.4593
##
## Mcnemar's Test P-Value : 0.004937
##
## Sensitivity : 0.69444
## Specificity : 0.86667
## Pos Pred Value : 0.45455
## Neg Pred Value : 0.94660
## Prevalence : 0.13793
## Detection Rate : 0.09579
## Detection Prevalence : 0.21073
## Balanced Accuracy : 0.78056
##
## 'Positive' Class : Yes
##
#Apply Random Forest to the output file
# Refit on the full data set (same formula and sampling settings as the
# train/test model), then predict attrition for the NOAttrition set.
EmpAtt2 <- randomForest(
  Attrition ~ . - Age,
  data = Attrition,
  strata = Attrition$Attrition,
  sampsize = c(60, 60)
)
AttPredict2 <- predict(EmpAtt2, newdata = NOAttrition)
# Pair each ID from NOAttrition with its predicted class.
EmpAttPreds <- data.frame(NOAttrition$ID, AttPredict2)
#EmpAttPreds
#write.csv(EmpAttPreds, "/Users/Kevin/Desktop/School/Doing Data Science/Project 2/Case2PredictionsAlbrightAttrition.csv")
## Linear Regression Model vs Random Forest for Predicting Salary
#Linear Regression Model
# Load the comparison set for which salaries are to be predicted.
NOSalary <- read.csv(
  "/Users/Kevin/Desktop/School/Doing Data Science/Project 2/CaseStudy2CompSet No Salary.csv",
  header = TRUE
)
# Income against age for the full data set with a fitted linear trend.
# `labels` is spelled out in full; the previous `label =` relied on partial
# argument matching.
ggplot(data = Attrition, aes(x = Age, y = MonthlyIncome)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  theme(
    panel.background = element_rect(fill = "white"),
    axis.line.x = element_line(),
    axis.line.y = element_line()
  ) +
  ggtitle("Linear Model Fitted to Data") +
  scale_y_continuous(labels = comma)
# Fixed: set.seed() used to be fused onto the end of the plot expression
# above, which R cannot parse.
set.seed(100)
splitPerc <- .70
trainAttrition <- sample(1:dim(Attrition)[1], round(splitPerc * dim(Attrition)[1]))
trainSalary <- Attrition[trainAttrition, ]
testSalary <- Attrition[-trainAttrition, ]
# Simple linear regression of income on age, fitted on the training rows.
fit_1 <- lm(MonthlyIncome ~ Age, data = trainSalary)
summary(fit_1)
##
## Call:
## lm(formula = MonthlyIncome ~ Age, data = trainSalary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9430 -2720 -705 2035 12653
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2328.48 728.79 -3.195 0.00147 **
## Age 237.71 19.14 12.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4175 on 607 degrees of freedom
## Multiple R-squared: 0.2026, Adjusted R-squared: 0.2013
## F-statistic: 154.2 on 1 and 607 DF, p-value: < 2.2e-16
# Income against age for the full data set with a fitted linear trend
# (same plot as the one drawn before the model fit).
ggplot(data = Attrition, aes(x = Age, y = MonthlyIncome)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  theme(
    panel.background = element_rect(fill = "white"),
    axis.line.x = element_line(),
    axis.line.y = element_line()
  ) +
  ggtitle("Linear Model Fitted to Data") +
  scale_y_continuous(labels = comma)
# Predictions for the NOSalary comparison set.
SalaryPreds <- predict(fit_1, NOSalary)
# Fixed: RMSE was previously computed between the NOSalary predictions and
# Attrition$MonthlyIncome. Those come from different files and different
# employees, so the number did not measure model error. The model is now
# scored on the held-out rows of the split it was trained from.
TestSalaryPreds <- predict(fit_1, newdata = testSalary)
RMSE(TestSalaryPreds, testSalary$MonthlyIncome)
## NOTE: the value below was rendered before this fix and will change on
## re-knit.
## [1] 5008.04
#data.frame(NOSalary$ID,SalaryPreds)
#Random Forest Model
#Train Data
# Draw another 70/30 split of the rows. No seed is set here, so the rows
# drawn depend on the random-number state left by the code that ran before.
splitPerc <- .70
trainAttrition <- sample(
  seq_len(nrow(Attrition)),
  round(splitPerc * nrow(Attrition))
)
trainSalary <- Attrition[trainAttrition, ]
testSalary <- Attrition[-trainAttrition, ]
# Show the column types of the training rows.
str(trainSalary)
## 'data.frame': 609 obs. of 36 variables:
## $ ID : int 395 530 124 478 596 744 619 258 867 302 ...
## $ Age : int 42 56 38 26 33 23 39 30 32 35 ...
## $ Attrition : Factor w/ 2 levels "Yes","No": 2 1 2 2 2 2 2 2 2 2 ...
## $ BusinessTravel : chr "Travel_Frequently" "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" ...
## $ DailyRate : int 1271 441 243 775 586 160 613 855 976 853 ...
## $ Department : chr "Research & Development" "Research & Development" "Sales" "Sales" ...
## $ DistanceFromHome : int 2 14 7 29 1 4 6 7 26 18 ...
## $ Education : int 1 4 4 2 3 1 1 4 4 5 ...
## $ EducationField : chr "Medical" "Life Sciences" "Marketing" "Medical" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 875 161 709 618 855 1735 2062 1428 333 74 ...
## $ EnvironmentSatisfaction : int 2 2 4 1 1 3 4 4 3 2 ...
## $ Gender : chr "Male" "Female" "Female" "Male" ...
## $ HourlyRate : int 35 72 46 45 48 51 42 73 100 71 ...
## $ JobInvolvement : int 3 3 2 3 4 3 2 3 3 3 ...
## $ JobLevel : int 1 1 2 2 2 1 3 2 2 3 ...
## $ JobRole : chr "Research Scientist" "Research Scientist" "Sales Executive" "Sales Executive" ...
## $ JobSatisfaction : int 4 2 4 3 1 2 1 1 4 1 ...
## $ MaritalStatus : chr "Single" "Married" "Single" "Divorced" ...
## $ MonthlyIncome : int 2515 4963 4028 4306 4037 3295 9991 4779 4465 9069 ...
## $ MonthlyRate : int 9068 4510 7791 4267 21816 12862 21457 12761 12069 11031 ...
## $ NumCompaniesWorked : int 5 9 0 5 1 1 4 7 0 1 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "Yes" "Yes" "No" "No" ...
## $ PercentSalaryHike : int 14 18 20 12 22 13 15 14 18 22 ...
## $ PerformanceRating : int 3 3 4 3 4 3 3 3 3 4 ...
## $ RelationshipSatisfaction: int 4 1 1 1 1 3 1 2 1 4 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 0 3 0 2 1 0 1 2 0 1 ...
## $ TotalWorkingYears : int 8 7 8 8 9 3 9 8 4 9 ...
## $ TrainingTimesLastYear : int 2 2 2 5 5 3 5 3 2 3 ...
## $ WorkLifeBalance : int 3 3 3 3 3 1 3 3 3 2 ...
## $ YearsAtCompany : int 2 5 7 0 9 3 7 3 3 9 ...
## $ YearsInCurrentRole : int 1 4 7 0 8 2 7 2 2 8 ...
## $ YearsSinceLastPromotion : int 2 4 0 0 0 1 1 0 2 1 ...
## $ YearsWithCurrManager : int 2 3 5 0 8 2 7 2 2 8 ...
# Random forest for MonthlyIncome using every column except Age, fitted on
# the training rows and applied to the held-out rows.
EmpRandoSalary <- randomForest(
  MonthlyIncome ~ . - Age,
  data = trainSalary
)
PredictSalary <- predict(EmpRandoSalary, newdata = testSalary)
#Test Data
RMSE(testSalary$MonthlyIncome, PredictSalary)
## [1] 1263.272
# Actual against predicted income for the held-out rows, with a linear
# trend line.
PredDF <- data.frame(MonthlyIncome = testSalary$MonthlyIncome, PredictSalary)
PredDF %>%
  ggplot(aes(x = MonthlyIncome, y = PredictSalary)) +
  geom_point() +
  ggtitle("Relationship Between Predicted and Actuals") +
  xlab("Actual") +
  ylab("Predicted") +
  geom_smooth(method = "lm")
#data.frame(testSalary$ID, PredictSalary)
#Random Forest model for predicting salary on the provided data set
# Give NOSalary$Attrition the same factor levels, in the same order, as the
# training data.
NOSalary$Attrition <- factor(as.character(NOSalary$Attrition), levels = c("Yes", "No"))
# Fit the salary forest on the training rows (every column except Age).
SalaryRF <- randomForest(MonthlyIncome ~ . - Age, data = trainSalary)
#importance(SalaryRF)
#varImpPlot(SalaryRF)
#Test on provided salary data set
#str(NOSalary)
#str(Attrition)
# Fixed: `importance = TRUE` and `ntree = 500` were passed to predict().
# predict.randomForest() has no such arguments; they are fitting options and
# were silently ignored here, so they have been removed.
PredictSalary2 <- predict(SalaryRF, newdata = NOSalary)
summary(PredictSalary2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2089 3222 5278 6232 6284 18023
# Pair each ID from NOSalary with its predicted salary and write the result
# to disk as CSV.
PredictionsAlbright <- data.frame(NOSalary$ID, PredictSalary2)
write.csv(
  PredictionsAlbright,
  file = "/Users/Kevin/Desktop/School/Doing Data Science/Project 2/AlbrightSalaryPredictions.csv"
)